# Load the IPUMS ATUS extract: parse the DDI codebook, read the microdata
# it describes, and standardize column names to snake_case.
ddi <- read_ipums_ddi("atus_00005.xml")
csv <- clean_names(read_ipums_micro(ddi))
## Use of data from IPUMS ATUS is subject to conditions including that users
## should cite the data appropriately. Use command `ipums_conditions()` for more
## details.
# Inspect the loaded data.
# Fixed: the original line was a bare `data`, which printed the source code
# of utils::data() (see the long function dump in the knitted output) rather
# than showing the dataset. Printing `csv` was presumably the intent.
csv
## function (..., list = character(), package = NULL, lib.loc = NULL,
## verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)
## {
## fileExt <- function(x) {
## db <- grepl("\\.[^.]+\\.(gz|bz2|xz)$", x)
## ans <- sub(".*\\.", "", x)
## ans[db] <- sub(".*\\.([^.]+\\.)(gz|bz2|xz)$", "\\1\\2",
## x[db])
## ans
## }
## my_read_table <- function(...) {
## lcc <- Sys.getlocale("LC_COLLATE")
## on.exit(Sys.setlocale("LC_COLLATE", lcc))
## Sys.setlocale("LC_COLLATE", "C")
## read.table(...)
## }
## stopifnot(is.character(list))
## names <- c(as.character(substitute(list(...))[-1L]), list)
## if (!is.null(package)) {
## if (!is.character(package))
## stop("'package' must be a character vector or NULL")
## }
## paths <- find.package(package, lib.loc, verbose = verbose)
## if (is.null(lib.loc))
## paths <- c(path.package(package, TRUE), if (!length(package)) getwd(),
## paths)
## paths <- unique(normalizePath(paths[file.exists(paths)]))
## paths <- paths[dir.exists(file.path(paths, "data"))]
## dataExts <- tools:::.make_file_exts("data")
## if (length(names) == 0L) {
## db <- matrix(character(), nrow = 0L, ncol = 4L)
## for (path in paths) {
## entries <- NULL
## packageName <- if (file_test("-f", file.path(path,
## "DESCRIPTION")))
## basename(path)
## else "."
## if (file_test("-f", INDEX <- file.path(path, "Meta",
## "data.rds"))) {
## entries <- readRDS(INDEX)
## }
## else {
## dataDir <- file.path(path, "data")
## entries <- tools::list_files_with_type(dataDir,
## "data")
## if (length(entries)) {
## entries <- unique(tools::file_path_sans_ext(basename(entries)))
## entries <- cbind(entries, "")
## }
## }
## if (NROW(entries)) {
## if (is.matrix(entries) && ncol(entries) == 2L)
## db <- rbind(db, cbind(packageName, dirname(path),
## entries))
## else warning(gettextf("data index for package %s is invalid and will be ignored",
## sQuote(packageName)), domain = NA, call. = FALSE)
## }
## }
## colnames(db) <- c("Package", "LibPath", "Item", "Title")
## footer <- if (missing(package))
## paste0("Use ", sQuote(paste("data(package =", ".packages(all.available = TRUE))")),
## "\n", "to list the data sets in all *available* packages.")
## else NULL
## y <- list(title = "Data sets", header = NULL, results = db,
## footer = footer)
## class(y) <- "packageIQR"
## return(y)
## }
## paths <- file.path(paths, "data")
## for (name in names) {
## found <- FALSE
## for (p in paths) {
## tmp_env <- if (overwrite)
## envir
## else new.env()
## if (file_test("-f", file.path(p, "Rdata.rds"))) {
## rds <- readRDS(file.path(p, "Rdata.rds"))
## if (name %in% names(rds)) {
## found <- TRUE
## if (verbose)
## message(sprintf("name=%s:\t found in Rdata.rds",
## name), domain = NA)
## thispkg <- sub(".*/([^/]*)/data$", "\\1", p)
## thispkg <- sub("_.*$", "", thispkg)
## thispkg <- paste0("package:", thispkg)
## objs <- rds[[name]]
## lazyLoad(file.path(p, "Rdata"), envir = tmp_env,
## filter = function(x) x %in% objs)
## break
## }
## else if (verbose)
## message(sprintf("name=%s:\t NOT found in names() of Rdata.rds, i.e.,\n\t%s\n",
## name, paste(names(rds), collapse = ",")),
## domain = NA)
## }
## if (file_test("-f", file.path(p, "Rdata.zip"))) {
## warning("zipped data found for package ", sQuote(basename(dirname(p))),
## ".\nThat is defunct, so please re-install the package.",
## domain = NA)
## if (file_test("-f", fp <- file.path(p, "filelist")))
## files <- file.path(p, scan(fp, what = "", quiet = TRUE))
## else {
## warning(gettextf("file 'filelist' is missing for directory %s",
## sQuote(p)), domain = NA)
## next
## }
## }
## else {
## files <- list.files(p, full.names = TRUE)
## }
## files <- files[grep(name, files, fixed = TRUE)]
## if (length(files) > 1L) {
## o <- match(fileExt(files), dataExts, nomatch = 100L)
## paths0 <- dirname(files)
## paths0 <- factor(paths0, levels = unique(paths0))
## files <- files[order(paths0, o)]
## }
## if (length(files)) {
## for (file in files) {
## if (verbose)
## message("name=", name, ":\t file= ...", .Platform$file.sep,
## basename(file), "::\t", appendLF = FALSE,
## domain = NA)
## ext <- fileExt(file)
## if (basename(file) != paste0(name, ".", ext))
## found <- FALSE
## else {
## found <- TRUE
## zfile <- file
## zipname <- file.path(dirname(file), "Rdata.zip")
## if (file.exists(zipname)) {
## Rdatadir <- tempfile("Rdata")
## dir.create(Rdatadir, showWarnings = FALSE)
## topic <- basename(file)
## rc <- .External(C_unzip, zipname, topic,
## Rdatadir, FALSE, TRUE, FALSE, FALSE)
## if (rc == 0L)
## zfile <- file.path(Rdatadir, topic)
## }
## if (zfile != file)
## on.exit(unlink(zfile))
## switch(ext, R = , r = {
## library("utils")
## sys.source(zfile, chdir = TRUE, envir = tmp_env)
## }, RData = , rdata = , rda = load(zfile,
## envir = tmp_env), TXT = , txt = , tab = ,
## tab.gz = , tab.bz2 = , tab.xz = , txt.gz = ,
## txt.bz2 = , txt.xz = assign(name, my_read_table(zfile,
## header = TRUE, as.is = FALSE), envir = tmp_env),
## CSV = , csv = , csv.gz = , csv.bz2 = ,
## csv.xz = assign(name, my_read_table(zfile,
## header = TRUE, sep = ";", as.is = FALSE),
## envir = tmp_env), found <- FALSE)
## }
## if (found)
## break
## }
## if (verbose)
## message(if (!found)
## "*NOT* ", "found", domain = NA)
## }
## if (found)
## break
## }
## if (!found) {
## warning(gettextf("data set %s not found", sQuote(name)),
## domain = NA)
## }
## else if (!overwrite) {
## for (o in ls(envir = tmp_env, all.names = TRUE)) {
## if (exists(o, envir = envir, inherits = FALSE))
## warning(gettextf("an object named %s already exists and will not be overwritten",
## sQuote(o)))
## else assign(o, get(o, envir = tmp_env, inherits = FALSE),
## envir = envir)
## }
## rm(tmp_env)
## }
## }
## invisible(names)
## }
## <bytecode: 0x156bbd238>
## <environment: namespace:utils>
#csv <- read_csv(file = "atus_00001.csv.gz") %>%
# clean_names()
# race
# Race lookup: the spreadsheet's `race` column holds the label and `code`
# holds the numeric ATUS code. Swap the names so the join key matches
# `csv$race`, then restore readable names after the join.
race_file <- read_excel("race_980x.xlsx") %>%
  rename(code1 = race, race = code)
csv <- csv %>%
  left_join(race_file, by = "race") %>%
  rename(race_code = race, race = code1)
# Education lookup: same rename-swap-join pattern as the race table.
# The code column is coerced to numeric so the join key type matches csv$educ.
education_file <- read_excel("Education.xlsx") %>%
  rename(code_educ = educ, educ = code_education) %>%
  mutate(educ = as.numeric(educ))
csv <- csv %>%
  left_join(education_file, by = "educ") %>%
  rename(code_education = educ, educ = code_educ)
# Marital-status lookup and a simplified 3-level recode.
# Fixed: the original assigned the coerced table to `education_file`
# (`education_file <- marital_file %>% mutate(...)`), so the as.numeric()
# coercion of `marst` never reached `marital_file` — the join below used
# the uncoerced copy — and it silently clobbered the education table.
marital_file <- read_excel("marital_status.xlsx")
marital_file <- marital_file %>%
  mutate(marst = as.numeric(marst))
csv <- left_join(csv, marital_file, by = "marst") %>%
  rename(code_marst = marst) %>%
  rename(marst = marst_code) %>%
  # codes 1-2 = married, 6+ = never married, everything between =
  # separated/divorced/widowed (per the lookup sheet — confirm in codebook)
  mutate(marst_simple = ifelse(code_marst <= 2, "Married",
                               ifelse(code_marst >= 6, "Never Married",
                                      "Separated/Divorced")))
# Demographic recodes, in one pipeline:
#  - keep adults strictly older than 18;
#  - flag respondents whose youngest child is under 19 (values of 99 in
#    yngch appear to mean "no young child" — confirm in the codebook);
#  - recode sex (1/2) to a readable gender label.
csv <- csv %>%
  filter(age > 18) %>%
  mutate(
    child  = ifelse(yngch < 19, 1, 0),
    gender = ifelse(sex == 1, "male", "female")
  )
# Summary of weekly earnings.
# Fixed: the original `csv %>% summary(earnweek)` passed `earnweek` as
# summary()'s second argument, which does not select a column — the knitted
# output shows the entire data frame was summarized instead. Select the
# column explicitly.
summary(csv$earnweek)
## year caseid pernum lineno wt06
## Min. :2003 Min. :2.003e+13 Min. :1 Min. :1 Min. : 419472
## 1st Qu.:2006 1st Qu.:2.006e+13 1st Qu.:1 1st Qu.:1 1st Qu.: 2932235
## Median :2011 Median :2.011e+13 Median :1 Median :1 Median : 5230098
## Mean :2011 Mean :2.011e+13 Mean :1 Mean :1 Mean : 7201674
## 3rd Qu.:2016 3rd Qu.:2.016e+13 3rd Qu.:1 3rd Qu.:1 3rd Qu.: 8816158
## Max. :2021 Max. :2.021e+13 Max. :1 Max. :1 Max. :209010030
## NA's :8469
## wt20 age sex race_code
## Min. : 0 Min. :19.00 Min. :1.000 Min. :100
## 1st Qu.: 3606315 1st Qu.:36.00 1st Qu.:1.000 1st Qu.:100
## Median : 6475784 Median :48.00 Median :2.000 Median :100
## Mean : 8746744 Mean :49.48 Mean :1.563 Mean :104
## 3rd Qu.: 11222184 3rd Qu.:62.00 3rd Qu.:2.000 3rd Qu.:100
## Max. :137151708 Max. :85.00 Max. :2.000 Max. :599
## NA's :199330
## code_marst code_education educyrs empstat_cps8 fullpart
## Min. :1.000 Min. :10.0 Min. :101.0 Min. :1.00 Min. : 1.00
## 1st Qu.:1.000 1st Qu.:21.0 1st Qu.:112.0 1st Qu.:1.00 1st Qu.: 1.00
## Median :1.000 Median :30.0 Median :214.0 Median :1.00 Median : 1.00
## Mean :2.764 Mean :29.7 Mean :190.4 Mean :2.72 Mean :37.06
## 3rd Qu.:4.000 3rd Qu.:40.0 3rd Qu.:216.0 3rd Qu.:5.00 3rd Qu.:99.00
## Max. :6.000 Max. :43.0 Max. :321.0 Max. :7.00 Max. :99.00
##
## earnweek eldch yngch nchild
## Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. :0.0000
## 1st Qu.: 673.1 1st Qu.:13.0 1st Qu.: 9.00 1st Qu.:0.0000
## Median : 1846.2 Median :99.0 Median :99.00 Median :0.0000
## Mean : 44392.0 Mean :60.8 Mean :59.44 Mean :0.8366
## 3rd Qu.:100000.0 3rd Qu.:99.0 3rd Qu.:99.00 3rd Qu.:2.0000
## Max. :100000.0 Max. :99.0 Max. :99.00 Max. :9.0000
##
## bls_carehh bls_comm bls_educ bls_food
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 30.00
## Median : 0.00 Median : 0.00 Median : 0.000 Median : 60.00
## Mean : 38.93 Mean : 10.74 Mean : 8.373 Mean : 75.24
## 3rd Qu.: 30.00 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 100.00
## Max. :1230.00 Max. :1200.00 Max. :1190.000 Max. :1320.00
##
## bls_hhact bls_leis bls_leis_arts bls_leis_attend
## Min. : 0.0 Min. : 0.0 Min. : 0.000 Min. : 0.000
## 1st Qu.: 15.0 1st Qu.: 155.0 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 75.0 Median : 294.0 Median : 0.000 Median : 0.000
## Mean : 124.9 Mean : 329.4 Mean : 5.879 Mean : 6.128
## 3rd Qu.: 185.0 3rd Qu.: 475.0 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :1405.0 Max. :1439.0 Max. :1015.000 Max. :860.000
##
## bls_leis_attsport bls_leis_partsport bls_leis_relax bls_leis_soc
## Min. : 0.000 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 90.0 1st Qu.: 130.0
## Median : 0.000 Median : 0.00 Median : 195.0 Median : 255.0
## Mean : 1.832 Mean : 16.25 Mean : 245.3 Mean : 297.7
## 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 356.0 3rd Qu.: 425.0
## Max. :1030.000 Max. :1125.00 Max. :1439.0 Max. :1439.0
##
## bls_leis_soccom bls_leis_soccomex bls_leis_sport bls_leis_travel
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 46.38 Mean : 40.22 Mean : 18.15 Mean : 13.54
## 3rd Qu.: 60.00 3rd Qu.: 45.00 3rd Qu.: 0.00 3rd Qu.: 12.00
## Max. :1350.00 Max. :1350.00 Max. :1125.00 Max. :1350.00
##
## bls_leis_tv bls_pcare bls_purch bls_social
## Min. : 0.0 Min. : 0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 40.0 1st Qu.: 490 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 122.0 Median : 564 Median : 0.00 Median : 0.00
## Mean : 176.9 Mean : 574 Mean : 49.23 Mean : 25.12
## 3rd Qu.: 255.0 3rd Qu.: 645 3rd Qu.: 73.00 3rd Qu.: 0.00
## Max. :1433.0 Max. :1440 Max. :1320.00 Max. :1325.00
##
## bls_work race educ marst
## Min. : 0.0 Length:216908 Length:216908 Length:216908
## 1st Qu.: 0.0 Class :character Class :character Class :character
## Median : 0.0 Mode :character Mode :character Mode :character
## Mean : 176.7
## 3rd Qu.: 438.2
## Max. :1430.0
##
## marst_simple child gender
## Length:216908 Min. :0.000 Length:216908
## Class :character 1st Qu.:0.000 Class :character
## Mode :character Median :0.000 Mode :character
## Mean :0.385
## 3rd Qu.:1.000
## Max. :1.000
##
# Employment status: recode empstat_cps8 into three labels —
# codes <= 2 = employed, >= 5 = not in the labour force, and the
# remainder = unemployed (mapping per the original recode — confirm
# against the ATUS codebook).
csv <- csv %>%
  mutate(
    empstat_cps8 = as.numeric(empstat_cps8),
    emp = ifelse(empstat_cps8 <= 2, "Employed",
                 ifelse(empstat_cps8 >= 5, "Not In Labour Force",
                        "Unemployed"))
  )
# Tabulate race groups and keep only those with more than 1,000 respondents
# (the top five groups), then restrict the data to them.
csv %>%
  count(race, sort = TRUE) %>%
  filter(n > 1000)
top_races <- c(
  "White only", "Black only", "Asian only",
  "American Indian, Alaskan Native", "White-American Indian"
)
csv <- csv %>%
  filter(race %in% top_races)
# Open the data in the viewer.
# Fixed: the original `view(c)` opened the viewer on the base function `c()`,
# not on the data; `csv` was presumably intended.
view(csv)
# Histogram of daily individual leisure minutes, pooled over all years.
csv %>%
  ggplot(aes(x = bls_leis)) +
  geom_histogram(binwidth = 50) +
  labs(
    title = "Distribution of daily individual leisure minutes",
    x = "Daily Minutes",
    y = "Count",
    caption = "Data from ATUS"
  )

# Side-by-side leisure distributions for 2007 and 2009.
csv %>%
  filter(year %in% c(2007, 2009)) %>%
  ggplot(aes(x = bls_leis)) +
  geom_histogram(binwidth = 50) +
  facet_wrap(~year) +
  labs(
    title = "Distribution of daily individual leisure minutes",
    x = "Daily Minutes",
    y = "Count",
    caption = "Data from ATUS"
  )

# Leisure distributions by gender, pre-pandemic years only.
# Fixed: the original piped through group_by(gender) before ggplot(), but
# ggplot2 ignores dplyr grouping — facet_wrap() does the per-gender split,
# so the group_by() was a misleading no-op and is removed.
csv %>%
  filter(year < 2020) %>%
  ggplot(mapping = aes(x = bls_leis)) +
  geom_histogram(binwidth = 50) +
  facet_wrap(~ gender) +
  labs(
    title = "Distribution of daily individual leisure minutes",
    x = "Daily Minutes",
    y = "Count",
    caption = "Data from ATUS"
  )

# Leisure distributions split by gender and year for 2007 vs. 2009.
recession_years <- csv %>%
  filter(year %in% c(2007, 2009))
ggplot(recession_years, aes(x = bls_leis)) +
  geom_histogram(binwidth = 50) +
  facet_wrap(~ gender + year) +
  labs(
    title = "Distribution of daily individual leisure minutes",
    x = "Daily Minutes",
    y = "Count",
    caption = "Data from ATUS"
  )

# Survey-weighted median of daily leisure minutes by gender and year
# (pre-2020), plotted as a trend line per gender.
leis_medians <- csv %>%
  filter(year < 2020) %>%
  group_by(gender, year) %>%
  summarize(median_leisure = weighted.median(bls_leis, as.numeric(wt06), na.rm = TRUE)) %>%
  clean_names()
ggplot(leis_medians, aes(x = year, y = median_leisure)) +
  geom_line() +
  facet_wrap(~ gender) +
  labs(
    title = "Median daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.

# Survey-weighted mean of daily leisure minutes by gender and year (pre-2020).
# Fixed: weighted.mean() was called without na.rm = TRUE, unlike the
# weighted.median(..., na.rm = TRUE) used above; the data contain NAs in the
# weight column (the summary output shows 8469 NAs in wt06), so any NA turned
# a whole gender-year cell into NA and broke the line.
csv %>%
  filter(year < 2020) %>%
  group_by(gender, year) %>%
  summarize(new = weighted.mean(bls_leis, wt06, na.rm = TRUE)) %>%
  clean_names() %>%
  ggplot(aes(x = year, y = new)) +
  geom_line() +
  facet_wrap(~ gender) +
  labs(
    title = "Weighted mean daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'gender'. You can override using the
## `.groups` argument.

# Weighted median leisure minutes around the recession (2006-2010), by gender.
csv %>%
  filter(year > 2005, year < 2011) %>%
  group_by(year, gender) %>%
  summarize(median_bls_leis = weighted.median(bls_leis, wt06)) %>%
  clean_names() %>%
  ggplot(aes(x = year, y = median_bls_leis)) +
  geom_line() +
  facet_wrap(~ gender) +
  labs(
    title = "Weighted median daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Weighted mean leisure minutes around the recession (2006-2010), by gender.
csv %>%
  filter(year > 2005, year < 2011) %>%
  group_by(year, gender) %>%
  summarize(mean_bls_leis = weighted.mean(bls_leis, wt06)) %>%
  clean_names() %>%
  ggplot(aes(x = year, y = mean_bls_leis)) +
  geom_line() +
  facet_wrap(~ gender) +
  labs(
    title = "Mean daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Weighted 25th/50th/75th percentiles of leisure minutes by gender and year
# (pre-2020), reshaped long so each quantile gets its own coloured line.
csv %>%
  filter(year < 2020) %>%
  group_by(year, gender) %>%
  summarize(
    quantile_25 = weighted.quantile(bls_leis, wt06, prob = 0.25),
    quantile_50 = weighted.quantile(bls_leis, wt06, prob = 0.50),
    quantile_75 = weighted.quantile(bls_leis, wt06, prob = 0.75)
  ) %>%
  clean_names() %>%
  pivot_longer(quantile_25:quantile_75, names_to = "quantile", values_to = "hours") %>%
  ggplot(aes(x = year, y = hours, group = quantile, colour = quantile)) +
  geom_line() +
  facet_wrap(~ gender) +
  labs(
    title = "Weighted daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Same weighted quantile plot, zoomed to the recession window (2006-2010).
csv %>%
  filter(year < 2020) %>%
  group_by(year, gender) %>%
  summarize(
    quantile_25 = weighted.quantile(bls_leis, wt06, prob = 0.25),
    quantile_50 = weighted.quantile(bls_leis, wt06, prob = 0.50),
    quantile_75 = weighted.quantile(bls_leis, wt06, prob = 0.75)
  ) %>%
  clean_names() %>%
  pivot_longer(quantile_25:quantile_75, names_to = "quantile", values_to = "hours") %>%
  filter(year > 2005, year < 2011) %>%
  ggplot(aes(x = year, y = hours, group = quantile, colour = quantile)) +
  geom_line() +
  facet_wrap(~gender) +
  labs(
    title = "Weighted daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Unweighted median leisure minutes by gender, recession window (2006-2010),
# as a comparison against the weighted version above.
csv %>%
  group_by(year, gender) %>%
  summarize(median = median(bls_leis)) %>%
  clean_names() %>%
  filter(year > 2005, year < 2011) %>%
  ggplot(aes(x = year, y = median)) +
  geom_line() +
  facet_wrap(~gender) +
  labs(
    title = "Daily individual leisure minutes by gender",
    y = "Daily Minutes",
    x = "Year",
    caption = "Data from ATUS"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

#csv %>%
# ggplot(mapping = aes(x = year, y = bls_leis)) +
# geom_jitter(alpha = 0.01)
# Naive survey-weighted regression of leisure minutes on gender.
# Fixed: lm()'s argument is `weights`; the original used `weight`, which
# only works via partial argument matching — fragile and flagged by lintr.
g <- csv %>%
  lm(formula = bls_leis ~ gender, weights = wt06)
summary(g)
##
## Call:
## lm(formula = bls_leis ~ gender, data = ., weights = wt06)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3873300 -353387 -33870 343917 6366061
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 291.1692 0.6646 438.10 <2e-16 ***
## gendermale 39.5660 0.9574 41.33 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 583600 on 206819 degrees of freedom
## (8382 observations deleted due to missingness)
## Multiple R-squared: 0.008191, Adjusted R-squared: 0.008186
## F-statistic: 1708 on 1 and 206819 DF, p-value: < 2.2e-16
# Naive survey-weighted regression of leisure on gender plus a linear year
# trend. (`weights` spelled out in full; the original relied on partial
# matching of `weight`.)
y <- csv %>%
  lm(formula = bls_leis ~ gender + year, weights = wt06)
# Restrict to race codes <= 210 for the race regression, print the subset,
# then fit leisure on gender, year, age and race. This fit is not assigned
# (it only prints coefficients); the identical model is stored as `a` below.
# (`weights` spelled out; the original relied on partial matching of
# `weight`.)
newcsv <- csv %>%
  filter(race_code <= 210)
newcsv
newcsv %>%
  lm(formula = bls_leis ~ gender + year + age + race, weights = wt06)
##
## Call:
## lm(formula = bls_leis ~ gender + year + age + race, data = .,
## weights = wt06)
##
## Coefficients:
## (Intercept) gendermale
## 16.98725 44.88100
## year age
## 0.07276 3.15759
## raceAsian only raceBlack only
## -65.93934 15.86296
## raceWhite only raceWhite-American Indian
## -28.02202 -10.83630
# Same model as above, stored this time so summary() can report standard
# errors and significance. (`weights` spelled out; the original relied on
# partial matching of `weight`.)
a <- newcsv %>%
  lm(formula = bls_leis ~ gender + year + age + race, weights = wt06)
summary(a)
##
## Call:
## lm(formula = bls_leis ~ gender + year + age + race, data = .,
## weights = wt06)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3099397 -342415 -41506 308384 7238310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.98725 176.24323 0.096 0.92321
## gendermale 44.88100 0.92435 48.554 < 2e-16 ***
## year 0.07276 0.08759 0.831 0.40618
## age 3.15759 0.02661 118.658 < 2e-16 ***
## raceAsian only -65.93934 5.79279 -11.383 < 2e-16 ***
## raceBlack only 15.86296 5.47901 2.895 0.00379 **
## raceWhite only -28.02202 5.34053 -5.247 1.55e-07 ***
## raceWhite-American Indian -10.83630 7.81604 -1.386 0.16562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 562700 on 206813 degrees of freedom
## (8382 observations deleted due to missingness)
## Multiple R-squared: 0.07786, Adjusted R-squared: 0.07783
## F-statistic: 2494 on 7 and 206813 DF, p-value: < 2.2e-16
# Final check: print the race-restricted data set.
newcsv